from utils.data_helper import build_vocab
from utils.data_helper import tokenizer_en
from loguru import logger

if __name__ == '__main__':
    # Smoke-check for vocabulary construction: build a vocab from the
    # training CSV with the English tokenizer, then log a sample of it.
    # NOTE(review): the relative path is resolved against the current
    # working directory, so this presumably has to be launched from a
    # directory whose parent contains `data/` -- confirm.
    vocab = build_vocab(
        tokenizer=tokenizer_en,
        filepath='../data/train.csv',
        min_freq=1,
    )

    # Materialise the string-to-index mapping as (token, index) pairs and
    # log only the first ten so the output stays readable.
    stoi = vocab.get_stoi()
    vocab_list = list(stoi.items())
    logger.debug(vocab_list[:10])

    # Log what the vocab returns for the two special tokens.
    unk_idx = vocab['<unk>']
    pad_idx = vocab['<pad>']
    logger.debug(f"<unk>：{unk_idx}, <pad>：{pad_idx}")
